How to get rid of the bad Python code

In [19]:
import json
import datetime
import tqdm

# Directory holding the 400 shard files (file0.txt ... file399.txt); each line
# of a shard is one JSON object with a 'CodeBlocks' list.
folder = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/outfiles'

# Never filled in this cell; kept so the notebook's global names stay unchanged.
lines = []

# Every dict below maps a code length (rounded to the nearest 10 and capped at
# 10000) to the list of code strings falling in that bucket:
#   guess_and_parse -- Guesslang says python AND Parsable == "True"
#   guess_not_parse -- Guesslang says python, Parsable != "True"
#   parse_not_guess -- Parsable == "True", Guesslang says something else
#   total           -- union of the three categories above
guess_and_parse = {}
guess_not_parse = {}
parse_not_guess = {}
total = {}
dates = []
for file_num in tqdm.tqdm(range(400)):
    filename = folder + '/file' + str(file_num) + '.txt'
    # Context manager so each shard is closed before the next one is opened.
    with open(filename) as shard:
        for line in shard:
            line_obj = json.loads(line)
            for code_block in line_obj['CodeBlocks']:
                code = code_block['code']
                # Bucket key: length rounded to the nearest 10; everything
                # longer than 10000 characters shares the 10000 bucket.
                l = min(int(round(len(code), -1)), 10000)
                guessed_python = code_block['Guesslang'].strip().lower() == "python"
                # 'Parsable' is compared as the string "True", not a boolean.
                parses = code_block['Parsable'] == "True"
                if not (parses or guessed_python):
                    # Neither signal says python: not part of any category.
                    continue
                # The buckets must hold the code itself: the sampling cell
                # prints random elements of these lists and reports their size.
                # NOTE(review): storing the code text in `total` as well is an
                # assumption -- no visible cell reads its elements.
                total.setdefault(l, []).append(code)
                if parses and guessed_python:
                    bucket = guess_and_parse
                elif parses:
                    bucket = parse_not_guess
                else:
                    bucket = guess_not_parse
                bucket.setdefault(l, []).append(code)

In [22]:
import random
# Bucket keys are code lengths rounded to the nearest 10, so round the
# requested length the same way before the lookup.
desired_code_length = round(100,-1)


def print_random_samples(buckets, code_length, num_samples=50, header=None):
    """Print randomly chosen code blocks from one length bucket.

    Args:
        buckets: dict mapping a rounded code length to a list of code strings.
        code_length: bucket key to sample from.
        num_samples: how many draws to print (draws are independent, so the
            same block can show up more than once).
        header: optional label printed before every draw.

    A missing or empty bucket is reported with a single message instead of
    raising KeyError / ValueError.
    """
    candidates = buckets.get(code_length, [])
    if not candidates:
        print('No code blocks of length', code_length, 'to choose from')
        return
    for _ in range(num_samples):
        if header is not None:
            print(header)
        print('Number to choose from', len(candidates))
        print(random.choice(candidates))


print_random_samples(guess_and_parse, desired_code_length, header='Guess AND parse')
print_random_samples(guess_not_parse, desired_code_length)
# To hand-label the third category as well:
# print_random_samples(parse_not_guess, desired_code_length, header='Parse NOT Guess')

Number to choose from 23890
    webBrowser1.Document.Write("<BODY background= D:\\Desktop\\123.jpg bgColor=#ffffff text=#000000>");

Number to choose from 23890
update a
set a.company_id = b.company_id
from vendorRegkeys a, users b
where a.createdby_id = b.user_id

Number to choose from 23890
In []: p= 2* rand(3, 1e4)- 1
In []: p= p[:, sum(p* p, 0)** .5<= 1]
In []: p.shape
Out[]: (3, 5216)

Number to choose from 23890
jQuery('body').text().replace(/\s{2,9999}/g, ' ')

Number to choose from 23890
  sum1 = 0
  for i = 1:K
  sum1 = sum1 + Y(k,i) *log(Htheta(k)) + (1 - Y(k,i))*log(1-Htheta(k))

Number to choose from 23890
cannot select option, no option with text '20120905' in select box 'date' (Capybara::ElementNotFound)

Number to choose from 23890
return this.patientRepository.FindAll(spc).OrderBy(a => a.Id).Skip(start).Take(limit).ToList();

Number to choose from 23890
Dim lst1 as new list(of integer)
Dim lst2 as new list(of integer)
Dim lst3 as new list(of integer)

Number to choose from 23890
ssh user@host 'cat - > /tmp/file.ext; do_something_with /tmp/file.ext;rm /tmp/file.ext' < file.ext 

Number to choose from 23890
Mockito.when(nrClient.uploadFiles("DF49ACBC8", Matchers.anyList(), "dl"))

Number to choose from 23890
var s = "";
for ( x in { 3:3, 1:1 } ) { s += x }
if ( s === "31" ) alert( 'JSC' )
else alert( 'V8' )

Number to choose from 23890
Warning: ftp_put() [function.ftp-put]: Filename invalid in D:\xampp\htdocs\mycloud\edit.php on line 7

Number to choose from 23890
if (arr[item] == 'a' || arr[item] == 'e' || arr[item] == 'i' || arr[item] == 'o' || arr[item] == 'u')

Number to choose from 23890
Info := 'destination=' + UrlEncode(EmailDestAddressFromIni) +
  '&' + 'Nicebody=' + UrlEncode(Nicebody);

Number to choose from 23890
select t1.* 
from table1 t1 join table2 t2 on
group by
having count(*)<2

Number to choose from 23890
  .style.backgroundImage="url( 'img/"+ clicked_id+".jpg' )";

Number to choose from 23890
def foo(a):
    def bar(a):
        a -= 1
        return a
    return bar(a)
>>> print foo(5) 

Number to choose from 23890
puts res.inspect # show the nested bits via  stdout
                 # or through some other logging

Number to choose from 23890
def mapper(k,v_list):
  for v in v_list:
    if criteria:
      write to HDFS

Number to choose from 23890
rails g teacher name age:integer email sex course 
#replaced class with course, now its working great.

Number to choose from 23890
@Html.HiddenFor(x => x.StartDate)  // if you don't want to display it or use the display for to show it

Number to choose from 23890

    or with attribute

$("#content").wrap("<table id='wrapper'>")

Number to choose from 23890
>>> import Tkinter #tkinter
>>> root = Tkinter.Tk()
>>>'package require Tkhtml')

Number to choose from 23890
from xx in table
where (from yy in string[] 
       select yy).Contains(xx.uid.ToString())
select xx

Number to choose from 23890
select top 6 Client_Country, count(*) Total
from table group by Client_Country
order by total desc

Number to choose from 23890
... inner join Containstable (fulltextTable, mycolumn, ?) as KeyTable on id = KeyTable.[KEY] ...

Number to choose from 23890
frame.setLocation(0,0) is top left.
frame.setLocation(0,700) moves it as close as i can to the bottom

Number to choose from 23890
def getmonthname:
    months = ['January', 'February', ... , 'December']
    return months[month - 1]

Number to choose from 23890
for (i in 1:(n-1))
  for (j in (i+1):n)
    cat(sprintf("(%g,%g)\n", i, j))
## (1,2)
## (1,3)
## (2,3)

Number to choose from 23890
Npgsql.EF6, version 2.0.12-pre4(Prerelease)
Npgsql, version
EntityFramework, version 6.0.2

Number to choose from 23890

>>> p = re.compile('.*', re.DEBUG)
max_repeat 0 65535
  any None

Number to choose from 23890
#set ($themeDisplay = $httpServletRequest.getAttribute("THEME_DISPLAY"))

Number to choose from 23890
undefined method `serial_number' for #<User:0x000000060b1d40>
extracted source (around line #8):

Number to choose from 23890
t1 =
t2 =
puts t1 == t2    # Says False
puts t1.eql?(t2) # Says False

Number to choose from 23890
cell.backgroundColor = UIColor.colorWithRed(125/255.0, green: 125/255.0, blue: 125/255.0, alpha: 1.0)  

Number to choose from 23890
org.jdesktop.jdic.init.JdicInitException: java.lang.UnsatisfiedLinkError: no jdic in java.library.path

Number to choose from 23890
View a report: Run report with html output -> save output to /tmp/abcd/ -> embedd it into your webapp.

Number to choose from 23890
; file one 
10     0.2   0.5   0.3
20     0.1   0.6   0.8
30     0.2   0.1   0.1
40     0.1   0.5   0.3

Number to choose from 23890
if (last row is selected)
    add a new row to the table

invoke the default down arrow action

Number to choose from 23890

>>> p = re.compile('.*', re.DEBUG)
max_repeat 0 65535
  any None

Number to choose from 23890
new_list = [f1, f2, datetime.datetime.strptime(f3, '%m/%d/%Y').date()
    for f1, f2, f3 in old_list]

Number to choose from 23890
    ' this was hiding the form as soon as it shows:

Number to choose from 23890
Integer.metaClass.gimmeAP = {->return 'p'}
assert 3.gimmeAP() == 'p'

Number to choose from 23890
sed -i".bak" "s:'export LD_PRELOAD="/usr/lib/ /usr/lib/"'::" ~/.bashrc

Number to choose from 23890

Number to choose from 23890
UPDATE omc_product
SET image = CONCAT('assets/', image), thumbnail = CONCAT('assets/', thumbnail)

Number to choose from 23890
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Number to choose from 23890
import csv

data = csv.reader(open('c:\x\list.csv' ))

for row in data:



Number to choose from 23890
normalization_factor = float(sum(p.values()))
for key, value in p:
p[key] = value/normalization_factor

Number to choose from 23890
to merge all the changes applied on it (assume there is no conflict) without opening the docx file.

Hand-labeling code

In [15]:
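# Hand-labeled counts: how many of the 50 sampled code blocks in each category are really Python.
# Categories are written as (guesslang says python)-(parses): true-true, true-false, false-true.
# The "For N" headings appear to be the desired_code_length used when sampling.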
#For 100:
#48, 8, 16
#For 500:
#50, 19, 12
#For 1000:
#50, 13, 7

In [16]:
# The same hand-labeled counts expressed as percentages of the 50 samples per category,
# for convenience. Column order is (guesslang says python)-(parses): true-true, true-false, false-true.
#For 100:
#96, 16, 32
#For 500:
#100, 38, 24
#For 1000:
#100, 26, 14


Given the above percentages, we can estimate how many of the code blocks are actually Python, and what share of the Python code blocks is captured by the first category (true-true).

In [1]:
# Size of each category, in code blocks.
# NOTE(review): `other_false` is weighted with the third-column (false-true)
# rates further down, and the qualitative notes call that category
# "other - true" -- the name looks inconsistent; confirm before renaming.
(python_true,
 python_false,
 other_false) = (1695196, 1471445, 2234022)

In [13]:
# Bracket the number of real Python blocks: weight each category's size by the
# highest hand-labeled Python rate seen for it (upper bound) and by the lowest
# one (lower bound).
max_python = 1 * python_true + 0.38 * python_false + 0.32 * other_false
min_python = 0.96 * python_true + 0.16 * python_false + 0.14 * other_false

In [10]:


In [11]:
# Estimated Python share of all categorized code blocks, upper bound first and
# lower bound second -- roughly half of the code isn't Python at all!
for python_estimate in (max_python, min_python):
    print(python_estimate / (python_true + python_false + other_false))


In [14]:
# Size of the first group (python_true) relative to the estimated amount of
# Python code: against the upper estimate first, then against the lower one.
# NOTE(review): min_python counts only 96% of python_true as Python while the
# numerator here uses all of python_true -- confirm this is intended.
for python_estimate in (max_python, min_python):
    print(python_true / python_estimate)


In [17]:
# Conclusion:
# Although this method of estimation may be a bit crude, it gives us a range that I'm pretty sure about, and I would
# estimate that the true ratio of python code captured is around 65% or more

In [24]:
# Qualitative information
# python - false (this is when guesslang says python but it does not parse)
#2  JSON
#17 Other programming languages
#18 python 
#3  ipython shell
#10 trace 

# other - true (this is when it parses but guesslang says something other than python)
#34 JSON
#9  python
#3  other programming languages

In [ ]: